# Set the working directory so the relative data path used later resolves.
# NOTE(review): a hard-coded setwd() ties this script to one machine; an
# RStudio project or a path relative to the script would be more portable.
setwd("~/Desktop/working-with-lyle/Formality_Project")

# Install pacman on first use, then let it install/load everything else.
if (!requireNamespace("pacman", quietly = TRUE)) install.packages("pacman")
library(pacman)
pacman::p_load(
  tidyverse, rlang, zoo, lubridate, plotrix, ggpubr, caret, broom,
  kableExtra, reactable,
  install = TRUE
)

# Colour palettes. palette_map had been swallowed by the preceding comment
# line and was therefore never defined.
palette_map <- c("#3B9AB2", "#EBCC2A", "#F21A00")
palette_condition <- c("#ee9b00", "#bb3e03", "#005f73")
# Shared ggplot theme that every figure in this script adds to its plot.
plot_aes <- theme_classic() +
  theme(
    legend.position = "top",
    legend.text = element_text(size = 12),
    text = element_text(size = 16, family = "Futura Medium"),
    axis.text = element_text(color = "black"),
    axis.line = element_line(colour = "black"),
    axis.ticks.y = element_blank()
  )

# Render a fitted model's coefficient table as a styled kable.
#
# model_data: a fitted model object that tidy() accepts (the script passes
#   lm fits).
# Returns the kable_styling() output, with the std.error, statistic and
#   p.value columns renamed to SE, t and p.
table_model <- function(model_data) {
  model_data %>%
    tidy() %>%
    rename(
      "SE" = std.error,
      "t" = statistic,
      "p" = p.value
    ) %>%
    kable() %>%
    kableExtra::kable_styling()
}

# Read in the data.
df <- read_csv('books_FK.csv')

# Tidy data: that is, one row per year, per variable.
# Aggregate to one row per publication year: the mean and standard error of
# each variable of interest. across() replaces the deprecated
# summarise_at()/funs() pair and yields the same column names
# (readability_mean, readability_std.error, grade_level_mean,
# grade_level_std.error).
tidy_df <- df %>%
  group_by(ORIG_PUBL_DATE) %>% # grouping by the year
  summarise(
    across(
      c(readability, grade_level),
      list(mean = mean, std.error = std.error)
    ),
    .groups = "drop"
  )

# Get the mean values for the year 1933.
year_means <- tidy_df %>%
  filter(ORIG_PUBL_DATE == 1933)

# Create centered variables by subtracting fixed reference values.
# NOTE(review): 85.97 and 4.996 look like the rounded 1933 means held in
# year_means -- confirm against that table if the data change.
tidy_df <- tidy_df %>%
  mutate(
    readability_centered = readability_mean - 85.97,
    grade_level_centered = grade_level_mean - 4.996
  )

# Flesch-Kincaid Ease of Readability: higher scores indicate material that is
# easier to read; lower numbers mark passages that are more difficult to read.
# The Flesch–Kincaid Grade Level score presents a score as a U.S. grade level, making it easier for teachers, parents, librarians, and others to judge the readability level of various books and texts.
# The following corpus consists of 599 books, from 599 authors, ranging from 1933 to 2020.
# Please note that these analyses were conducted on the folder titled text-plain!
# Number of books (rows) in the corpus.
df %>%
  select(filename) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)

# Number of distinct books per author-gender category.
auth_sex <- df %>%
  select(AUTH_GENDER, filename) %>%
  unique() %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(n = n()) %>%
  reactable::reactable(striped = TRUE)
auth_sex

# M and F = male AND female authors wrote the book; not an aggregate of males
# and females.
# Mean readability per author-gender category.
# De-duplicate per book (filename) rather than per (gender, score) pair:
# unique() on the score alone would collapse two different books that happen
# to share a score and bias the group mean.
Read_sex <- df %>%
  distinct(AUTH_GENDER, filename, readability) %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(mean = mean(readability)) %>%
  reactable::reactable(striped = TRUE)
Read_sex

# Mean grade level per author-gender category (same per-book de-duplication).
Grade_sex <- df %>%
  distinct(AUTH_GENDER, filename, grade_level) %>%
  group_by(AUTH_GENDER) %>%
  dplyr::summarize(mean = mean(grade_level)) %>%
  reactable::reactable(striped = TRUE)
Grade_sex

# Please see attached files for the graphs if needed.
# Smoothed trend plots.
# These are drawn from the book-level (non-aggregated) data so the individual
# variation stays visible.

# Readability: one point per book with a LOESS trend over publication year.
# The stacked theme() calls are folded into one call carrying their net
# effect (later settings override earlier ones).
readability_smooth <- ggplot(
  data = df,
  mapping = aes(x = ORIG_PUBL_DATE, y = readability, group = 1)
) +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.70) +
  annotate(
    geom = "text", x = 1935, y = 100, size = 3.5,
    label = "
estimate = 0.0796
p-value < 0.001
"
  ) +
  labs(title = "Readability", x = "Year", y = 'Ease of Readability') +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = 'plot',
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )
# Grade level: one point per book with a LOESS trend over publication year.
# The stacked theme() calls are folded into one call carrying their net
# effect (later settings override earlier ones).
grade_level_smooth <- ggplot(
  data = df,
  mapping = aes(x = ORIG_PUBL_DATE, y = grade_level, group = 1)
) +
  geom_point(color = "dodgerblue3", alpha = 0.5) +
  geom_smooth(method = "loess", span = 0.70) +
  annotate(
    geom = "text", x = 1935, y = 12, size = 3.5,
    label = "
estimate = -0.0199
p-value < 0.001
"
  ) +
  labs(title = "Reading Grade Level", x = "Year", y = 'Reading Grade Level') +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = 'plot',
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )
# Stack the two book-level smoothed plots into one figure.
smooth_graphs <- ggpubr::ggarrange(
  readability_smooth, grade_level_smooth,
  ncol = 1, nrow = 2, common.legend = TRUE, legend = "bottom"
)

# Add the overall title and the figure note. The band drawn by geom_smooth()
# is a 95% confidence interval by default, not a standard error, so the note
# says so.
annotate_figure(
  smooth_graphs,
  top = text_grob(
    "Smooth Readability Graphs",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Shaded band represents the 95% confidence interval of the LOESS fit.",
    color = "Black",
    hjust = 1.05, x = 1, face = "italic", size = 12
  )
)

# Readability on the by-year aggregated data: one point per year (the yearly
# mean) with a LOESS trend. The stacked theme() calls are folded into one
# call carrying their net effect.
readability_smooth_tidy <- ggplot(
  data = tidy_df,
  aes(x = ORIG_PUBL_DATE, y = readability_mean, group = 1)
) +
  ggtitle("Readability") +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.90) +
  plot_aes +
  labs(x = "Year", y = 'Ease of Readability') +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = 'plot',
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  ) +
  annotate(
    geom = "text", x = 1935, y = 87, size = 3.5,
    label = "
estimate = 0.0745
p-value < 0.001
"
  )
# Grade level on the by-year aggregated data: one point per year (the yearly
# mean) with a LOESS trend. The stacked theme() calls are folded into one
# call carrying their net effect (later settings override earlier ones).
grade_smooth_tidy <- ggplot(
  data = tidy_df,
  mapping = aes(x = ORIG_PUBL_DATE, y = grade_level_mean, group = 1)
) +
  geom_point(color = "dodgerblue3", alpha = 0.7) +
  geom_smooth(method = "loess", span = 0.90) +
  annotate(
    geom = "text", x = 1935, y = 8, size = 3.5,
    label = "
estimate = -0.0192
p-value < 0.001
"
  ) +
  labs(title = "Grade Level", x = "Year", y = 'Grade Level Score') +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = 'plot',
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )
# Stack the two by-year smoothed plots into one figure.
tidy_smooth_graphs <- ggpubr::ggarrange(
  readability_smooth_tidy, grade_smooth_tidy,
  ncol = 1, nrow = 2, common.legend = TRUE, legend = "bottom"
)

# Add the overall title and the figure note. The band drawn by geom_smooth()
# is a 95% confidence interval by default, not a standard error, so the first
# sentence of the note says so.
annotate_figure(
  tidy_smooth_graphs,
  top = text_grob(
    "Smooth Flesch-Kincaid",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    paste0(
      "Note. Shaded band represents the 95% confidence interval of the LOESS fit.",
      "\n",
      "Estimates displayed are from mean centered analyses (data centered on 1933)"
    ),
    color = "Black",
    hjust = 1, x = 1, face = "italic", size = 12
  )
)

# Raw yearly readability: the by-year mean as a line with a mean +/- SE
# ribbon. The title and axis label previously read "Readbility". The stacked
# theme() calls are folded into one call carrying their net effect.
Readability <- ggplot(
  data = tidy_df,
  aes(x = ORIG_PUBL_DATE, y = readability_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    aes(
      ymin = readability_mean - readability_std.error,
      ymax = readability_mean + readability_std.error
    ),
    alpha = 0.2
  ) +
  ggtitle("Readability") +
  plot_aes +
  labs(x = "Year", y = 'Ease of Readability') +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.text = element_text(size = 14),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = 'plot',
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )
# Raw yearly grade level: the by-year mean as a line with a mean +/- SE
# ribbon. The stacked theme() calls are folded into one call carrying their
# net effect (later settings override earlier ones).
grade_level <- ggplot(
  data = tidy_df,
  mapping = aes(x = ORIG_PUBL_DATE, y = grade_level_mean, group = 1)
) +
  geom_line(colour = "dodgerblue3") +
  geom_ribbon(
    mapping = aes(
      ymin = grade_level_mean - grade_level_std.error,
      ymax = grade_level_mean + grade_level_std.error
    ),
    alpha = 0.2
  ) +
  labs(title = "Grade Level", x = "Year", y = 'Flesch-Kincaid Grade Level') +
  plot_aes +
  theme(
    axis.text = element_text(size = 14),
    axis.text.x = element_text(angle = 45, hjust = 1),
    axis.title = element_text(size = 20, face = "bold"),
    plot.title.position = 'plot',
    plot.title = element_text(hjust = 0.5, face = "bold", size = 20)
  )
# Raw graphs: stack the two by-year line plots into one figure.
raw_graphs <- ggpubr::ggarrange(
  Readability, grade_level,
  ncol = 1, nrow = 2, common.legend = TRUE, legend = "bottom"
)

# Add the overall title and the figure note (here the ribbons are mean +/- SE).
annotate_figure(
  raw_graphs,
  top = text_grob(
    "Raw Flesch-Kincaid Graphs (grouped by year)",
    color = "black", face = "bold", size = 20
  ),
  bottom = text_grob(
    "Note. Horizontal shading represents Standard Error. ",
    color = "Black",
    hjust = 1.05, x = 1, face = "italic", size = 16
  )
)

# Models presented in order: raw data, aggregated by year, centered on 1933.
# (The original text said 1857, but the corpus starts in 1933 and the
# centering elsewhere in this script uses 1933.)
# Linear trend of readability over publication year.
# Raw (book-level) data
Readability_RAW <- lm(readability ~ ORIG_PUBL_DATE, data = df)
# Tidy (aggregated by year) data
Readability_TIDY <- lm(readability_mean ~ ORIG_PUBL_DATE, data = tidy_df)
# Centered outcome on the aggregated data
Readability_centered <- lm(readability_centered ~ ORIG_PUBL_DATE, data = tidy_df)

# Coefficient tables. The commented tables are the knitted output that was
# recorded in the source document.
table_model(Readability_RAW)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | -73.6851 | 26.1862 | -2.814 | 0.0051 |
# | ORIG_PUBL_DATE | 0.0796 | 0.0132 | 6.021 | 0.0000 |

table_model(Readability_TIDY)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | -63.6450 | 24.9021 | -2.556 | 0.0124 |
# | ORIG_PUBL_DATE | 0.0745 | 0.0126 | 5.912 | 0.0000 |

table_model(Readability_centered)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | -149.6150 | 24.9021 | -6.008 | 0 |
# | ORIG_PUBL_DATE | 0.0745 | 0.0126 | 5.912 | 0 |
# Linear trend of grade level over publication year.
# Raw (book-level) data
Grade_RAW <- lm(grade_level ~ ORIG_PUBL_DATE, data = df)
# Tidy (aggregated by year) data
Grade_TIDY <- lm(grade_level_mean ~ ORIG_PUBL_DATE, data = tidy_df)
# Centered outcome on the aggregated data
Grade_centered <- lm(grade_level_centered ~ ORIG_PUBL_DATE, data = tidy_df)

# Coefficient tables. The commented tables are the knitted output that was
# recorded in the source document.
table_model(Grade_RAW)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 44.6362 | 5.7299 | 7.790 | 0 |
# | ORIG_PUBL_DATE | -0.0199 | 0.0029 | -6.881 | 0 |

table_model(Grade_TIDY)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 43.1663 | 5.3006 | 8.144 | 0 |
# | ORIG_PUBL_DATE | -0.0192 | 0.0027 | -7.144 | 0 |

table_model(Grade_centered)
# | term | estimate | SE | t | p |
# |---|---|---|---|---|
# | (Intercept) | 38.1703 | 5.3006 | 7.201 | 0 |
# | ORIG_PUBL_DATE | -0.0192 | 0.0027 | -7.144 | 0 |